library(tidyverse)
library(tigris)
library(censusapi)
library(sf)
library(mapview)
library(plotly)
library(lehdr)
options(
tigris_class = "sf",
tigris_use_cache = T
)
My first analysis looks at the median household income of each block group in San Jose and how it correlates to the percent of residents who have been staying completely at home in the past three days.
Sys.setenv(CENSUS_KEY=" 6c05445ae84c23e1c62fd91756d0f56f51ae94ef")
# below code creates a "lookup" table that will be used later on
acs_vars <-
listCensusMetadata(
name = "2018/acs/acs5",
type = "variables"
)
# following code loads in census data and processes it to be more readable
sj_median_income_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "B19013_001E",
key = "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
rename(
Median_Income = B19013_001E
) %>%
filter(!is.na(Median_Income)) %>%
left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
dplyr::select(-geometry) %>% # comment out this line if geometry is desired
filter(
!is.na(DISTRICTS)
) %>%
# this code joins our census data with the social distancing data, processed as shown below
left_join(sj_socialdistancing %>%
filter(date > max(date)-3) %>%
group_by(origin_census_block_group) %>%
summarize(
completely_home_device_count = sum(completely_home_device_count),
device_count = sum(device_count)) %>%
mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
by = c("blockgroup" = "origin_census_block_group")
) %>%
filter(
!is.na(device_count)
)
# now we can create a scatter plot of median income v % completely at home
sj_median_income_by_block %>%
filter(Median_Income > 0) %>%
ggplot(aes(
x = Median_Income,
y = `% Completely at Home`
)) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Median Household Income",
y = "% residents completely at home in past 3 days",
title = "San Jose: Social Distancing and Median Household Income"
)
My second analysis looks at the percent of households below the federal poverty line in each block group and how this correlates with the percent of residents completly at home in the past three days.
sj_poverty_by_block <-
# first we need to load census data. I'm loading in a table of poverty status by household type
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B17017)",
key = "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
# The following code shows my process of selecting the right variables from the census data
gather(
key = code,
value = number,
- blockgroup
) %>%
left_join(acs_vars %>% select_if(names(.) %in% c("name", "label")),
by = c("code" = "name")) %>%
# at this point I was able to identify the two variables I needed for analysis
filter(
code %in% c("B17017_001E", "B17017_002E")
) %>%
select(-c("label")) %>%
spread(code, number) %>%
rename(
Households_Total = B17017_001E,
Households_Under_Povertyline = B17017_002E
) %>%
# now we can add a column for % of households below poverty line and combine it with the sj_socialdistancing df
mutate(
Households_Total = as.numeric(Households_Total),
Households_Under_Povertyline = as.numeric(Households_Under_Povertyline),
`% households below poverty line` = Households_Under_Povertyline / Households_Total * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
) %>%
filter(
!is.na(device_count) # this takes out block groups that aren't in SJ
) %>%
mutate(
log_of_poverty = log(as.numeric(`% households below poverty line`))
)
# now we can look at the relationship between % completely at home and % under poverty line on a scatter plot
sj_poverty_by_block %>% ggplot(aes(
x = `% households below poverty line`,
y = `% Completely at Home`
)) +
geom_point() +
labs(
y = "% residents completely at home in past 3 days",
x = log(as.numeric("% households below poverty line")),
title = "San Jose: Social Distancing and Poverty Status"
)
The results are heteroscedastic, so we take the log of the x-variable
sj_poverty_by_block %>%
filter(
log_of_poverty > 0 # taking out blockgroups with no poverty
) %>%
ggplot(aes(
x = log_of_poverty,
y = `% Completely at Home`
)) +
geom_point() +
geom_smooth(method=lm) +
labs(
y = "Percent of devices completely at home in past 3 days",
x = log(as.numeric("Percent of households below poverty line")),
title = "San Jose: Social Distancing and Poverty Status"
)
Using area median income (AMI) is a more appropriate way to measure low-income status, as the cost of living is substantially higher than the federal average in San Jose. This analysis plots percent of devices entirely at home against percent of households with incomes 50% and 80% percent of Santa Clara County AMI.
sj_ami_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B19001)",
key = "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
group_by(blockgroup) %>%
summarize(
Total = B19001_001E,
`Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
#sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
`Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
) %>%
mutate(
`% under 75,000` = `Under 75,000` / Total * 100,
`% under 100,000` = `Under 100,000` / Total * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
) %>%
filter(!is.na(device_count))
cor(sj_ami_by_block$`% under 75,000`, sj_ami_by_block$`% Completely at Home`)
## [1] -0.4182122
sj_ami_by_block %>% ggplot(aes(
x = `% under 75,000`,
y = `% Completely at Home`
)) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with incomes under $75,000 (50% AMI) annually",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Households Above and Below 50% AMI"
)
sj_ami_by_block %>% ggplot(aes(
x = `% under 100,000`,
y = `% Completely at Home`
)) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with incomes under $100,000 (80% AMI) annually",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Households Above and Below 80% AMI"
)
In this part of my analysis I load in a LODES dataset on SJ in order to compare percent of devices at home to a number of economic and employment factors. The
ca_rac <-
grab_lodes(
state = "ca",
year = 2017,
lodes_type = "rac",
job_type = "JT01",
segment = "S000",
state_part = "main",
agg_geo = "bg"
) %>%
left_join(sj_blockgroups, by = c("h_bg" = "GEOID")) %>%
filter(
!is.na(DISTRICTS)
) %>%
mutate(
percent_healthcare_workers = CNS18 / C000 * 100
) %>%
dplyr::select(h_bg, DISTRICTS, percent_healthcare_workers) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income), by = c("h_bg" = "blockgroup")) %>%
filter(
!is.na(device_count)
)
ca_rac %>% ggplot(aes(
x = percent_healthcare_workers,
y = `% Completely at Home`
)) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of workers in health care and social assistance",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Health Care Workers"
)